

# Classify candidate segments of a segmentation result with one of the
# pre-trained random-forest models.
#
# Arguments:
#   dataseg           list with `$data` (columns 1-2 are the marker map, one
#                     further column per sample) and `$output` (segment table
#                     whose first column is the sample ID and which has
#                     `seg.mean` and `num.mark` columns).
#   path              not used by this function; kept so existing calls work.
#   use.cohort        TRUE selects the models that use cross-sample
#                     predictors, FALSE the "noothers" models.
#   smoothed          TRUE selects the "smo" model, FALSE the "nosmo" model.
#   glad              TRUE selects the "glad" model and forces smoothed=TRUE.
#   gainloss.defined  TRUE when `$output` already has a 7th column holding the
#                     gain/loss call; otherwise GL() derives it.
#                     NOTE(review): definepredictors() reads that call as
#                     `segs$state`, so the 7th column presumably has to be
#                     named "state" - confirm with callers.
#   nmad              multiplier passed to GL() for the gain/loss thresholds.
#
# Returns the data frame built by definepredictors() with an extra character
# column `predicted.CNV` (NA for rows that are not candidates), or NULL after
# printing a message when gainloss.defined=TRUE but `$output` does not have
# exactly 7 columns.
#
# The model objects (finalrfsmo, finalrfnosmo, finalrfglad,
# finalrfnootherssmo, finalrfnoothersnosmo, finalrfnoothersglad) are not
# defined here and must be visible when the function is called.
predict.CNVs <- function(dataseg, path, use.cohort = TRUE, smoothed = TRUE,
                         glad = FALSE, gainloss.defined = FALSE, nmad = 1) {
  # There is no unsmoothed GLAD model, so glad implies smoothed.
  if (glad) smoothed <- TRUE
  library(randomForest)

  segs <- dataseg$output
  map <- dataseg$data[, 1:2]

  if (gainloss.defined && ncol(segs) != 7) {
    print("should define gains/losses in the 7th column of segmentation output or set gainloss.defined=FALSE")
    return()
  }

  ns <- dim(dataseg$data)[2] - 2
  samnms <- names(dataseg$data)[-c(1, 2)]

  if (gainloss.defined) {
    # Per-sample noise: MAD of (observed value - mean of its segment).
    madstr <- vapply(seq_len(ns), function(i) {
      obs <- dataseg$data[, 2 + i]
      rows <- dataseg$output[, 1] == samnms[i]
      fitted <- rep(dataseg$output$seg.mean[rows],
                    dataseg$output$num.mark[rows])
      mad(obs[!is.na(obs)] - fitted)
    }, numeric(1))
  } else {
    b <- GL(dataseg, nmad)
    segs <- b[[1]]$output
    # Half the distance between the gain and loss thresholds, i.e. nmad * MAD.
    # NOTE(review): this equals the MAD used in the other branch only when
    # nmad == 1 - confirm whether the scaling by nmad is intended.
    madstr <- (b[[2]] - b[[3]]) / 2
  }

  segs2 <- definepredictors(segs, madstr, map)

  cand <- segs2$is.cand == 1
  segs2$predicted.CNV <- NA_character_

  # Only touch a model when there is something to classify; just the selected
  # model object is ever evaluated.
  if (any(cand)) {
    if (use.cohort) {
      model <- if (glad) {
        finalrfglad
      } else if (smoothed) {
        finalrfsmo
      } else {
        finalrfnosmo
      }
    } else {
      model <- if (glad) {
        finalrfnoothersglad
      } else if (smoothed) {
        finalrfnootherssmo
      } else {
        finalrfnoothersnosmo
      }
    }
    segs2$predicted.CNV[cand] <-
      as.character(predict(model, newdata = segs2[cand, ]))
  }

  return(segs2)
}



# Call every segment "Gain", "Loss" or "Normal" relative to per-sample
# thresholds.
#
# Arguments:
#   seg   list with `$data` (columns 1-2 are the marker map, one further
#         column per sample) and `$output` (segment table whose first column
#         is the sample ID and which has `seg.mean` and `num.mark` columns).
#   nmad  number of MADs the thresholds sit away from the sample median.
#
# For each sample the MAD of the residuals (observed value minus the mean of
# the segment it belongs to) is computed; a segment is a "Gain" when its mean
# is above median + nmad * MAD and a "Loss" when it is below
# median - nmad * MAD.
#
# Returns list(seg, gainthres, lossthres): `seg` with column 7 of `$output`
# set to the call and named "state", plus the per-sample thresholds.
GL <- function(seg, nmad) {
  ns <- dim(seg$data)[2] - 2
  samnms <- names(seg$data)[-c(1, 2)]
  gainthres <- rep(NA_real_, ns)
  lossthres <- rep(NA_real_, ns)
  seg$output[, 7] <- "Normal"

  for (i in seq_len(ns)) {
    obs <- seg$data[, 2 + i]
    rows <- seg$output[, 1] == samnms[i]

    # Expand each segment mean over its markers so it lines up with the
    # non-missing observations of this sample.
    fitted <- rep(seg$output$seg.mean[rows], seg$output$num.mark[rows])
    madd <- mad(obs[!is.na(obs)] - fitted)

    centre <- median(obs, na.rm = TRUE)
    gainthres[i] <- centre + nmad * madd
    lossthres[i] <- centre - nmad * madd

    # "Loss" is assigned after "Gain", as before, so it wins if both match.
    seg$output[rows & seg$output$seg.mean > gainthres[i], 7] <- "Gain"
    seg$output[rows & seg$output$seg.mean < lossthres[i], 7] <- "Loss"
  }

  names(seg$output)[7] <- "state"
  return(list(seg, gainthres, lossthres))
}


# Build the per-segment predictor columns consumed by the CNV models.
#
# Arguments:
#   segs  data frame of segments with columns ID, chrom, loc.start, loc.end,
#         num.mark, seg.mean and state ("Gain", "Loss" or "Normal").
#   mads  numeric vector of per-sample noise estimates, one per sample in the
#         order in which the samples first appear in segs$ID.
#   map   data frame of markers with columns chrom and maploc.
#
# Returns `segs` after merging runs of short same-state segments, with these
# columns added: is.cand, is.physio, length, segdup, in.otherptsGainLoss,
# CNAs, in.lit, in.litN, is.close.end, is.close.centro, breaks, sign, absmean,
# absmeanSD, close.cand, other.cand.lit, surrnd, in.otherpts, percentN,
# in.otherptsE.
#
# Reads four tables that are not defined in this function and must be visible
# when it is called: phys (chrom, start, end), segrear (Chr, hg18start,
# hg18end), adamscnv09 (Chr, Start, End, TotalGainLossInv) and chrinfo (Chrom,
# TelomereLeftEnd, TelomereRightStart, centromereStart, centromereEnd).
definepredictors <- function(segs, mads, map) {
  chrlist <- unique(segs$chrom)
  samnms <- unique(segs$ID)
  ns <- length(samnms)
  tumors <- seq_len(ns)
  MX <- 2300000      # longest segment (bp) that can still be a candidate
  TCmax <- 2000000   # largest distance (bp) counted as close to telomere/centromere
  Maxd <- 500000     # largest gap (bp) between neighbouring candidates, 500 Kb

  segs$is.cand <- 0
  segs$is.physio <- 0

  ## length
  segs$length <- (segs$loc.end - segs$loc.start)

  # Merge runs of consecutive rows that share `state`, sample and chromosome
  # and are each at most MX long, provided the merged span is also at most MX.
  # The first row of a run absorbs the others; the absorbed row indices are
  # returned in `excl` so the caller can drop them once both passes are done.
  merge.short.runs <- function(segs, state) {
    excl <- NULL
    nr <- nrow(segs)
    w <- which(segs$state == state & segs$length <= MX)
    # Rows of `w` whose immediate successor (by row index) is also in `w`.
    doubles <- w[which(diff(w) == 1)]
    if (length(doubles) == 0) {
      return(list(segs = segs, excl = excl))
    }
    i <- doubles[1]
    while (i <= max(doubles)) {
      cons <- i
      set <- i
      # `cons < nr` keeps the look-ahead inside the table, so a run may reach
      # the very last row without the condition turning NA.
      while (cons < nr &&
             segs$state[cons + 1] == segs$state[i] &&
             segs$chrom[cons + 1] == segs$chrom[i] &&
             segs$ID[cons + 1] == segs$ID[i] &&
             segs$length[cons + 1] <= MX) {
        cons <- cons + 1
        set <- c(set, cons)
      }
      last <- set[length(set)]
      # All-or-nothing: a run whose total span exceeds MX is left untouched.
      if (segs$loc.end[last] - segs$loc.start[i] <= MX) {
        segs$seg.mean[i] <- sum(segs$num.mark[set] * segs$seg.mean[set]) /
          sum(segs$num.mark[set])
        segs$num.mark[i] <- sum(segs$num.mark[set])
        segs$loc.end[i] <- segs$loc.end[last]
        segs$length[i] <- segs$loc.end[i] - segs$loc.start[i]
        excl <- c(excl, set[-1])
      }
      nxt <- doubles[doubles > last]
      i <- if (length(nxt) > 0) nxt[1] else max(doubles) + 1
    }
    list(segs = segs, excl = excl)
  }

  # combining all nearby that have total length <= MX (losses, then gains)
  excl <- NULL
  for (st in c("Loss", "Gain")) {
    merged <- merge.short.runs(segs, st)
    segs <- merged$segs
    excl <- c(excl, merged$excl)
  }
  # A zero-length negative index would select no rows at all, so only drop
  # rows when something was actually absorbed.
  if (length(excl) > 0) segs <- segs[-excl, ]

  ## candidates: short non-Normal segments that touch a Normal neighbour on the
  ## same sample/chromosome, or that are the only segment there.
  segs$is.cand <- 0
  for (i in tumors) {
    for (chr in chrlist) {
      w <- which(segs$chrom == chr & segs$ID == samnms[i])
      n <- length(w)
      if (n == 0) next   # this sample has no segment on this chromosome
      altered <- segs$state[w] != "Normal" & segs$length[w] <= MX
      if (n == 1) {
        cand <- altered
      } else {
        is.normal <- segs$state[w] == "Normal"
        prev.normal <- c(FALSE, is.normal[-n])
        next.normal <- c(is.normal[-1], FALSE)
        cand <- altered & (prev.normal | next.normal)
      }
      segs$is.cand[w[which(cand)]] <- 1
    }
  }

  ## physio: candidates overlapping a region listed in `phys` are flagged and
  ## removed from the candidate set.
  # NOTE(review): only rows 1-6 of `phys` are consulted, as before - confirm
  # that the table has exactly six rows.
  segs$is.physio <- 0
  for (i in 1:6) {
    hit <- segs$chrom == phys$chrom[i] & segs$is.cand == 1 &
      !(segs$loc.start > phys$end[i] | segs$loc.end < phys$start[i])
    segs$is.physio[hit] <- 1
  }
  segs$is.cand[segs$is.physio == 1] <- 0

  ## segmental dup: any segment overlapping a region listed in `segrear`
  segs$segdup <- 0
  for (i in seq_len(nrow(segrear))) {
    hit <- segs$chrom == segrear$Chr[i] &
      !(segs$loc.start > segrear$hg18end[i] | segs$loc.end < segrear$hg18start[i])
    segs$segdup[hit] <- 1
  }

  ## other patients by state: states of overlapping candidates in other samples
  segs$in.otherptsGainLoss <- "None"
  for (i in which(segs$is.cand == 1)) {
    w <- segs$is.cand == 1 & segs$ID != segs$ID[i] & segs$chrom == segs$chrom[i]
    if (any(w)) {
      overlap <- !(segs$loc.start[w] > segs$loc.end[i] |
                     segs$loc.end[w] < segs$loc.start[i])
      others <- segs$state[w][overlap]
      if (length(others) == 0) {
        segs$in.otherptsGainLoss[i] <- "None"
      } else if (all(others == "Loss")) {
        segs$in.otherptsGainLoss[i] <- "LL"
      } else if (all(others == "Gain")) {
        segs$in.otherptsGainLoss[i] <- "GG"
      } else if (any(others == "Loss") & any(others == "Gain")) {
        segs$in.otherptsGainLoss[i] <- "LG"
      }
    }
  }
  # "None" is always the reference level, even if no row happens to carry it.
  gl.levels <- c("None",
                 setdiff(sort(unique(segs$in.otherptsGainLoss)), "None"))
  segs$in.otherptsGainLoss <- factor(segs$in.otherptsGainLoss,
                                     levels = gl.levels)

  ## non-candidate Gains/Losses in the region in other patients: number of
  ## other samples with an overlapping, same-state, non-candidate segment
  segs$CNAs <- 0
  for (i in which(segs$is.cand == 1)) {
    w <- segs$is.cand != 1 & segs$ID != segs$ID[i] & segs$is.physio == 0 &
      segs$chrom == segs$chrom[i]
    if (any(w)) {
      hit <- !(segs$loc.start[w] > segs$loc.end[i] |
                 segs$loc.end[w] < segs$loc.start[i]) &
        segs$state[w] == segs$state[i]
      segs$CNAs[i] <- length(unique(segs$ID[w][hit]))
    }
  }

  ## literature: averaged over the markers inside a candidate, the number of
  ## `adamscnv09` regions covering the marker (in.lit) and the sum of their
  ## TotalGainLossInv (in.litN). NA when the candidate contains no marker.
  segs$in.lit <- 0
  segs$in.litN <- 0
  for (i in which(segs$is.cand == 1)) {
    w <- which(map$chrom == segs$chrom[i] & map$maploc >= segs$loc.start[i] &
                 map$maploc <= segs$loc.end[i])
    if (length(w) == 0) {
      segs$in.lit[i] <- NA
      segs$in.litN[i] <- NA
      next
    }
    in.lit <- numeric(length(w))
    in.litN <- numeric(length(w))
    for (k in seq_along(w)) {
      jj <- w[k]
      covered <- map$maploc[jj] >= adamscnv09$Start &
        map$maploc[jj] <= adamscnv09$End &
        map$chrom[jj] == adamscnv09$Chr
      in.lit[k] <- sum(covered)
      in.litN[k] <- sum(adamscnv09$TotalGainLossInv[covered], na.rm = TRUE)
    }
    segs$in.lit[i] <- mean(in.lit, na.rm = TRUE)
    segs$in.litN[i] <- mean(in.litN, na.rm = TRUE)
  }

  ## closeness to centromere/telomere (within TCmax, measured outward from the
  ## telomere/centromere boundary to the nearer segment end)
  segs$is.close.end <- 0
  segs$is.close.centro <- 0
  for (chr in chrlist) {
    i <- which(chrinfo$Chrom == chr)
    d.left <- segs$loc.start - chrinfo$TelomereLeftEnd[i]
    d.right <- chrinfo$TelomereRightStart[i] - segs$loc.end
    segs$is.close.end[segs$chrom == chr &
                        ((d.left >= 0 & d.left <= TCmax) |
                           (d.right >= 0 & d.right <= TCmax))] <- 1
    d.after <- segs$loc.start - chrinfo$centromereEnd[i]
    d.before <- chrinfo$centromereStart[i] - segs$loc.end
    segs$is.close.centro[segs$chrom == chr &
                           ((d.after >= 0 & d.after <= TCmax) |
                              (d.before >= 0 & d.before <= TCmax))] <- 1
  }

  ## is breaking: |mean of previous segment - mean of next segment| for each
  ## candidate, with 0 standing in for a missing neighbour.
  # NOTE(review): this and the later per-chromosome loops run over 1:22 rather
  # than chrlist, as before - presumably autosomes only; confirm.
  segs$breaks <- 0
  for (i in tumors) {
    for (chr in 1:22) {
      w <- which(segs$chrom == chr & segs$ID == samnms[i])
      pos <- which(segs$is.cand[w] == 1)
      x <- segs$seg.mean[w][pos - 1]   # index 0 is dropped silently
      if (any(pos == 1)) x <- c(0, x)
      y <- segs$seg.mean[w][pos + 1]   # past-the-end index yields NA
      y[is.na(y)] <- 0
      segs$breaks[w[pos]] <- abs(x - y)
      if (length(x) != length(y)) {
        print(c(i, chr))
        stop()
      }
    }
  }

  # Noise estimate of the sample each row belongs to (mads follows the order
  # of first appearance of the sample IDs).
  mad.per.seg <- mads[match(segs$ID, unique(segs$ID))]
  segs$breaks <- segs$breaks / mad.per.seg

  ##
  segs$sign <- sign(segs$seg.mean)
  segs$absmean <- abs(segs$seg.mean)
  segs$absmeanSD <- abs(segs$seg.mean) / mad.per.seg

  ## is there another candidate within 500 kb away
  segs$close.cand <- 0
  for (i in tumors) {
    for (chr in 1:22) {
      w <- which(segs$chrom == chr & segs$ID == samnms[i])
      nn <- which(segs$is.cand[w] == 1)
      k <- length(nn)
      if (k > 1) {
        # Gap between each candidate and the next candidate on the chromosome.
        gap <- segs$loc.start[w][nn[-1]] - segs$loc.end[w][nn[-k]]
        near <- gap <= Maxd
        close <- c(FALSE, near) | c(near, FALSE)
        segs$close.cand[w[nn[which(close)]]] <- 1
      }
    }
  }

  ## mean in.lit of the other candidates on the same sample/chromosome
  segs$other.cand.lit <- 0
  for (i in tumors) {
    for (chr in 1:22) {
      rows <- which(segs$chrom == chr & segs$ID == samnms[i])
      cands <- rows[segs$is.cand[rows] == 1]
      if (length(cands) > 1) {
        for (r in cands) {
          segs$other.cand.lit[r] <-
            max(0, mean(segs$in.lit[setdiff(cands, r)], na.rm = TRUE),
                na.rm = TRUE)
        }
      }
    }
  }

  ## surrounded by NGL? "NN" = both neighbours Normal, "NGL" = exactly one,
  ## "GL" = neither; a missing neighbour is replaced by the other neighbour.
  segs$surrnd <- 0
  for (i in tumors) {
    for (chr in 1:22) {
      w <- which(segs$chrom == chr & segs$ID == samnms[i])
      pos <- which(segs$is.cand[w] == 1)
      x <- segs$state[w][pos - 1]
      if (any(pos == 1)) x <- c(NA, x)
      y <- segs$state[w][pos + 1]
      x[is.na(x)] <- y[is.na(x)]
      y[is.na(y)] <- x[is.na(y)]
      rows <- w[pos]
      segs$surrnd[rows[which(x == "Normal" & y == "Normal")]] <- "NN"
      segs$surrnd[rows[which((x == "Normal" & y != "Normal") |
                               (x != "Normal" & y == "Normal"))]] <- "NGL"
      segs$surrnd[rows[which(x != "Normal" & y != "Normal")]] <- "GL"
      if (length(x) != length(y)) {
        print(c(i, chr))
        stop()
      }
    }
  }
  segs$surrnd <- as.factor(as.character(segs$surrnd))

  ## fraction of the other samples with an overlapping candidate
  segs$in.otherpts <- 0
  for (i in which(segs$is.cand == 1)) {
    w <- segs$is.cand == 1 & segs$ID != segs$ID[i] & segs$chrom == segs$chrom[i]
    if (any(w)) {
      overlap <- !(segs$loc.start[w] > segs$loc.end[i] |
                     segs$loc.end[w] < segs$loc.start[i])
      segs$in.otherpts[i] <-
        length(unique(segs$ID[w][overlap])) / (length(samnms) - 1)
    }
  }

  ## share of the chromosome's markers that lie in Normal segments
  segs$percentN <- 0
  for (chr in 1:22) {
    for (j in samnms) {
      w <- which(segs$ID == j & segs$chrom == chr)
      segs$percentN[w][segs$is.cand[w] == 1] <-
        sum(segs$num.mark[w][segs$state[w] == "Normal"]) / sum(segs$num.mark[w])
    }
  }

  ## fraction of the other samples with a candidate sharing a start or an end
  segs$in.otherptsE <- 0
  for (i in which(segs$is.cand == 1)) {
    w <- segs$is.cand == 1 & segs$ID != segs$ID[i] & segs$chrom == segs$chrom[i]
    if (any(w)) {
      same.edge <- segs$loc.start[w] == segs$loc.start[i] |
        segs$loc.end[w] == segs$loc.end[i]
      segs$in.otherptsE[i] <-
        length(unique(segs$ID[w][same.edge])) / (length(samnms) - 1)
    }
  }

  return(segs)
}
